/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */
/* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */
package net.nutch.indexer;
import java.util.*;
import java.io.*;
import org.apache.lucene.util.*;
import org.apache.lucene.util.PriorityQueue;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
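/**
 * Writes a reduced copy of a Lucene index.  The source index is read from
 * the <code>index</code> subdirectory of a given directory and the result
 * is written to its <code>index-opt</code> subdirectory.  Only terms whose
 * idf is at most IDF_THRESHOLD are carried over, and for each such term
 * only the highest-scoring FRACTION of its postings (ranked by tf times
 * the field norm) is kept.  No per-document data is copied.
 */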
public class IndexOptimizer {
public static final String DONE_NAME = "optimize.done";
private static final float IDF_THRESHOLD = 6.0f;
private static final float FRACTION = 0.1f;
/**
 * Pass-through {@link TermDocs}: every method forwards to the wrapped
 * instance.  Intended as a base class whose methods are selectively
 * overridden.
 */
private static class FilterTermDocs implements TermDocs {
  /** The wrapped enumerator that all calls are forwarded to. */
  protected TermDocs in;

  /** Wraps the given enumerator. */
  public FilterTermDocs(TermDocs in) {
    this.in = in;
  }

  public void seek(Term term) throws IOException {
    in.seek(term);
  }

  public void seek(TermEnum termEnum) throws IOException {
    in.seek(termEnum);
  }

  public int doc() {
    return in.doc();
  }

  public int freq() {
    return in.freq();
  }

  public boolean next() throws IOException {
    return in.next();
  }

  public int read(int[] docBuffer, int[] freqBuffer) throws IOException {
    return in.read(docBuffer, freqBuffer);
  }

  public boolean skipTo(int target) throws IOException {
    return in.skipTo(target);
  }

  public void close() throws IOException {
    in.close();
  }
}
/**
 * Pass-through {@link TermPositions} built on {@link FilterTermDocs}; adds
 * forwarding of <code>nextPosition()</code> to the wrapped instance.
 */
private static class FilterTermPositions
  extends FilterTermDocs implements TermPositions {

  /** Wraps the given positions enumerator. */
  public FilterTermPositions(TermPositions in) {
    super(in);
  }

  public int nextPosition() throws IOException {
    // the constructor only accepts TermPositions, so this cast is safe
    TermPositions positions = (TermPositions)in;
    return positions.nextPosition();
  }
}
/**
 * Pass-through {@link TermEnum}: every method forwards to the wrapped
 * enumeration.
 */
private static class FilterTermEnum extends TermEnum {
  /** The wrapped term enumeration that all calls are forwarded to. */
  protected TermEnum in;

  /** Wraps the given term enumeration. */
  public FilterTermEnum(TermEnum in) {
    this.in = in;
  }

  public boolean next() throws IOException {
    return in.next();
  }

  public Term term() {
    return in.term();
  }

  public int docFreq() {
    return in.docFreq();
  }

  public void close() throws IOException {
    in.close();
  }
}
/**
 * Term enumeration over a reader's terms that skips every term whose idf,
 * as computed by the supplied {@link Similarity}, exceeds IDF_THRESHOLD.
 */
private static class OptimizingTermEnum extends FilterTermEnum {
  private IndexReader reader;
  private Similarity similarity;

  /**
   * @param reader the reader whose terms are enumerated and whose maxDoc()
   *        is passed to the idf computation
   * @param similarity supplies the idf function
   */
  public OptimizingTermEnum(IndexReader reader, Similarity similarity)
    throws IOException {
    super(reader.terms());
    this.similarity = similarity;
    this.reader = reader;
  }

  /**
   * Advances the wrapped enumeration until it rests on a term whose idf is
   * at most IDF_THRESHOLD.
   *
   * @return true if such a term was found, false once the wrapped
   *         enumeration is exhausted
   */
  public boolean next() throws IOException {
    boolean accepted = false;
    while (!accepted && in.next()) {
      float idf = similarity.idf(in.docFreq(), reader.maxDoc());
      accepted = idf <= IDF_THRESHOLD;
    }
    return accepted;
  }
}
/**
 * Priority queue of {@link ScoreDoc}s.  An entry is "less than" another
 * when its score is lower; between equal scores, the entry with the
 * larger document number is the lesser one.
 */
private static class ScoreDocQueue extends PriorityQueue {

  /** Creates a queue initialized with the given size. */
  ScoreDocQueue(int size) {
    initialize(size);
  }

  protected final boolean lessThan(Object a, Object b) {
    ScoreDoc first = (ScoreDoc)a;
    ScoreDoc second = (ScoreDoc)b;
    if (first.score != second.score) {
      return first.score < second.score;
    }
    // equal scores: the larger document number ranks lower
    return first.doc > second.doc;
  }
}
/**
 * Postings enumerator that, once positioned on a term through
 * {@link #seek(TermEnum)}, lets {@link #next()} return only the
 * highest-scoring FRACTION of that term's documents.  A document's score
 * for the term is tf(freq) multiplied by the decoded field norm.
 *
 * NOTE: only seek(TermEnum), next() and close() are overridden here;
 * seek(Term), read() and skipTo() are inherited and forward to the wrapped
 * enumerator without applying the filter.
 */
private static class OptimizingTermPositions extends FilterTermPositions {
  private IndexReader reader;
  // second enumerator over the same reader, used for the scoring pass so
  // that the wrapped positions enumerator is left untouched by it
  private TermDocs termDocs;
  private ScoreDocQueue sdq;
  // documents selected for the term most recently passed to seek(TermEnum)
  private BitSet docs;
  private Similarity similarity;

  /**
   * @param reader the reader whose postings and norms are read
   * @param similarity supplies the tf and norm-decoding functions
   */
  public OptimizingTermPositions(IndexReader reader, Similarity similarity)
    throws IOException {
    super(reader.termPositions());
    this.reader = reader;
    this.termDocs = reader.termDocs();
    this.similarity = similarity;
    // seek() inserts a candidate before evicting the lowest entry, so the
    // queue briefly holds one entry more than the number it keeps.  That
    // number can be as large as ceil(maxDoc * FRACTION), hence the extra
    // slot; without it the queue can be filled past its capacity.
    this.sdq =
      new ScoreDocQueue((int)Math.ceil(reader.maxDoc() * FRACTION) + 1);
    this.docs = new BitSet(reader.maxDoc());
  }

  /**
   * Positions both enumerators on the term of <code>enum_</code>, scores
   * every document of that term and records the top
   * ceil(docFreq * FRACTION) of them for later filtering by next().
   */
  public void seek(TermEnum enum_) throws IOException {
    super.seek(enum_);
    termDocs.seek(enum_);

    byte[] norms = reader.norms(enum_.term().field());

    sdq.clear();
    float minScore = 0.0f;
    int count = (int)Math.ceil(enum_.docFreq() * FRACTION);
    System.out.println("Optimizing " + enum_.term()
                       + " from " + enum_.docFreq()
                       + " to " + count);
    while (termDocs.next()) {
      int doc = termDocs.doc();
      float score =
        similarity.tf(termDocs.freq()) * similarity.decodeNorm(norms[doc]);

      // candidates not scoring above the current minimum cannot displace
      // anything already queued
      if (score > minScore) {
        sdq.put(new ScoreDoc(doc, score));
        if (sdq.size() > count) {                 // if sdq overfull
          sdq.pop();                              // remove lowest in sdq
          minScore = ((ScoreDoc)sdq.top()).score; // reset minScore
        }
      }
    }

    // turn the surviving queue entries into a membership set
    docs.clear();
    while (sdq.size() != 0) {
      docs.set(((ScoreDoc)sdq.pop()).doc);
    }
  }

  /**
   * Advances the wrapped enumerator to the next document selected by the
   * last seek(TermEnum).
   *
   * @return true if positioned on a selected document, false when the
   *         wrapped enumerator is exhausted
   */
  public boolean next() throws IOException {
    while (in.next()) {
      if (docs.get(in.doc()))
        return true;
    }
    return false;
  }

  /** Closes the wrapped enumerator and the one used for scoring. */
  public void close() throws IOException {
    try {
      super.close();
    } finally {
      termDocs.close();
    }
  }
}
/**
 * Reader view that reports no documents and hands out the filtering term
 * and postings enumerators over the wrapped reader.
 */
private static class OptimizingReader extends FilterIndexReader {
  private Similarity similarity = new NutchSimilarity();

  /** Wraps the given reader. */
  public OptimizingReader(IndexReader reader) {
    super(reader);
  }

  // don't copy any per-document data
  public int maxDoc() {
    return 0;
  }

  public int numDocs() {
    return 0;
  }

  public boolean hasDeletions() {
    return false;
  }

  // filter out low frequency terms
  public TermEnum terms() throws IOException {
    return new OptimizingTermEnum(in, similarity);
  }

  // filter out low-scoring postings
  public TermPositions termPositions() throws IOException {
    return new OptimizingTermPositions(in, similarity);
  }
}
/** Directory holding the "index" input and "index-opt" output. */
private final File directory;

/**
 * Creates an optimizer for the given directory.
 *
 * @param directory parent directory containing the <code>index</code>
 *        subdirectory to read; the output is written beside it
 */
public IndexOptimizer(File directory) {
  this.directory = directory;
}
/**
 * Reads the index in the <code>index</code> subdirectory through an
 * {@link OptimizingReader} and writes the filtered result as a newly
 * created index in the <code>index-opt</code> subdirectory.
 *
 * @throws IOException if opening, merging or closing either index fails
 */
public void optimize() throws IOException {
  IndexReader reader = IndexReader.open(new File(directory, "index"));
  try {
    OptimizingReader optimizer = new OptimizingReader(reader);
    IndexWriter writer = new IndexWriter(new File(directory, "index-opt"),
                                         null, true);
    try {
      writer.addIndexes(new IndexReader[] { optimizer });
    } finally {
      // closing the writer finishes the output index and releases its lock
      writer.close();
    }
  } finally {
    // release the files held open on the source index
    reader.close();
  }
}
/**
 * Command-line entry point.  Expects the directory to optimize as the
 * first argument, runs the optimization and prints the elapsed wall-clock
 * time in milliseconds.  With no arguments, prints a usage line to
 * standard error and returns.
 */
public static void main(String[] args) throws Exception {
  final String usage = "IndexOptimizer directory";

  if (args.length < 1) {
    System.err.println("Usage: " + usage);
    return;
  }

  IndexOptimizer optimizer = new IndexOptimizer(new File(args[0]));

  long startMillis = System.currentTimeMillis();
  optimizer.optimize();
  long elapsedMillis = System.currentTimeMillis() - startMillis;

  System.out.println(elapsedMillis + " total milliseconds");
}
}